import numpy as np
import pandas as pd
import matplotlib
import seaborn as sns
import os
# List the CSV files available in the dataset directory so we know what to load.
os.listdir(r"C:\Users\hp\Desktop\DataAnalysis\Datasets-20240722T151923Z-001\Datasets")
['other-American_B01362.csv', 'other-Carmel_B00256.csv', 'other-Dial7_B00887.csv', 'other-Diplo_B01196.csv', 'other-Federal_02216.csv', 'other-FHV-services_jan-aug-2015.csv', 'other-Firstclass_B01536.csv', 'other-Highclass_B01717.csv', 'other-Lyft_B02510.csv', 'other-Prestige_B01338.csv', 'other-Skyline_B00111.csv', 'Uber-Jan-Feb-FOIL.csv', 'uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-janjune-15.csv', 'uber-raw-data-janjune-15_sample.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
# Load the Jan-June 2015 Uber pickups dataset with pandas.read_csv().
uber_15=pd.read_csv(r'C:\Users\hp\Desktop\DataAnalysis\Datasets-20240722T151923Z-001\Datasets\uber-raw-data-janjune-15.csv')
uber_15.shape  # (rows, columns) — roughly 14.3M rows, 4 columns per the output below
(14270479, 4)
uber_15.duplicated().sum() # count fully-duplicated rows in the data frame, to proceed with our data cleaning work
898225
uber_15.drop_duplicates(inplace=True)# remove all the duplicated rows in place
uber_15.duplicated().sum() # verify that all duplicates were removed (expect 0)
0
uber_15.dtypes  # inspect column dtypes; Pickup_date is still a plain string (object) at this point
Dispatching_base_num object Pickup_date object Affiliated_base_num object locationID int64 dtype: object
uber_15.isnull().sum()# count missing values per column (only Affiliated_base_num has any)
Dispatching_base_num 0 Pickup_date 0 Affiliated_base_num 160702 locationID 0 dtype: int64
uber_15['Pickup_date'][0]  # peek at one raw value to see the timestamp string format
'2015-05-17 09:47:00'
# Convert Pickup_date from string to datetime64 (Timestamp) so that date
# parts (month, day, hour, ...) can be extracted below.
# Supplying the explicit format — seen above as '2015-05-17 09:47:00' —
# avoids per-row format inference, which is dramatically slower on ~13M rows.
uber_15['Pickup_date']=pd.to_datetime(uber_15['Pickup_date'], format='%Y-%m-%d %H:%M:%S')
# confirm the type conversion took effect
type(uber_15['Pickup_date'][0])
pandas._libs.tslibs.timestamps.Timestamp
# Extract the month (by name, e.g. 'May') from the parsed pickup timestamp.
uber_15['month']=uber_15['Pickup_date'].dt.month_name()
uber_15['month']
0 May
1 May
2 May
3 May
4 May
...
14270474 May
14270475 May
14270476 May
14270477 May
14270478 May
Name: month, Length: 13372254, dtype: object
# Frequency analysis of pickups per month.
# value_counts() returns the counts in descending-count order, so reindex
# to calendar order (the data covers Jan-June 2015) before plotting —
# otherwise the line connects months in an arbitrary-looking sequence.
month_order = ['January', 'February', 'March', 'April', 'May', 'June']
uber_15['month'].value_counts().reindex(month_order).plot(color='red',marker="o")
<Axes: xlabel='month'>
# Derive calendar features (weekday name, day-of-month, hour, minute)
# from the parsed Pickup_date for the grouping/plotting below.
uber_15['days']=uber_15['Pickup_date'].dt.day_name()
uber_15['date']=uber_15['Pickup_date'].dt.day
uber_15['hour']=uber_15['Pickup_date'].dt.hour
uber_15['minute']=uber_15['Pickup_date'].dt.minute
uber_15.head()
| Dispatching_base_num | Pickup_date | Affiliated_base_num | locationID | month | days | date | hour | minute | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | B02617 | 2015-05-17 09:47:00 | B02617 | 141 | May | Sunday | 17 | 9 | 47 |
| 1 | B02617 | 2015-05-17 09:47:00 | B02617 | 65 | May | Sunday | 17 | 9 | 47 |
| 2 | B02617 | 2015-05-17 09:47:00 | B02617 | 100 | May | Sunday | 17 | 9 | 47 |
| 3 | B02617 | 2015-05-17 09:47:00 | B02774 | 80 | May | Sunday | 17 | 9 | 47 |
| 4 | B02617 | 2015-05-17 09:47:00 | B02617 | 90 | May | Sunday | 17 | 9 | 47 |
# Frequency table of pickups for every (month, day-of-week) combination.
# pd.crosstab tallies how often each pair of category values occurs,
# giving a DataFrame indexed by month with one column per weekday.
pivot = pd.crosstab(uber_15['month'], uber_15['days'])
pivot
| days | Friday | Monday | Saturday | Sunday | Thursday | Tuesday | Wednesday |
|---|---|---|---|---|---|---|---|
| month | |||||||
| April | 315002 | 238429 | 324545 | 273560 | 372522 | 250632 | 338015 |
| February | 373550 | 274948 | 368311 | 296130 | 335603 | 287260 | 286387 |
| January | 339285 | 190606 | 386049 | 230487 | 330319 | 196574 | 245650 |
| June | 371225 | 375312 | 399377 | 334434 | 357782 | 405500 | 328141 |
| March | 309631 | 269931 | 314785 | 313865 | 277026 | 320634 | 256767 |
| May | 430134 | 255501 | 464298 | 390391 | 337607 | 290004 | 316045 |
type(pivot)  # confirm crosstab returned a DataFrame
pandas.core.frame.DataFrame
# Grouped bar chart: one bar per weekday within each month; compact two-column legend.
pivot.plot(kind='bar',figsize= (10,4),fontsize=10).legend(frameon=False, loc='upper left', ncol=2 ,fontsize='8')
<matplotlib.legend.Legend at 0x1aa7dd83d10>
# Pickup volume for every (weekday, hour) pair, then one line per weekday
# showing how demand varies across the hours of the day.
summary = uber_15.groupby(['days', 'hour']).size().reset_index(name='size')
sns.pointplot(x='hour', y='size', hue='days', data=summary).legend(fontsize='8')
<matplotlib.legend.Legend at 0x1aa7235e9d0>
# Re-list the dataset directory to pick the next file to analyse.
os.listdir(r"C:\Users\hp\Desktop\DataAnalysis\Datasets-20240722T151923Z-001\Datasets")
['other-American_B01362.csv', 'other-Carmel_B00256.csv', 'other-Dial7_B00887.csv', 'other-Diplo_B01196.csv', 'other-Federal_02216.csv', 'other-FHV-services_jan-aug-2015.csv', 'other-Firstclass_B01536.csv', 'other-Highclass_B01717.csv', 'other-Lyft_B02510.csv', 'other-Prestige_B01338.csv', 'other-Skyline_B00111.csv', 'Uber-Jan-Feb-FOIL.csv', 'uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-janjune-15.csv', 'uber-raw-data-janjune-15_sample.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
# Load the Jan-Feb 2015 FOIL dataset: daily active vehicles and trips per dispatching base.
uber_foil=pd.read_csv(r"C:\Users\hp\Desktop\DataAnalysis\Datasets-20240722T151923Z-001\Datasets\Uber-Jan-Feb-FOIL.csv")
uber_foil.head()
| dispatching_base_number | date | active_vehicles | trips | |
|---|---|---|---|---|
| 0 | B02512 | 1/1/2015 | 190 | 1132 |
| 1 | B02765 | 1/1/2015 | 225 | 1765 |
| 2 | B02764 | 1/1/2015 | 3427 | 29421 |
| 3 | B02682 | 1/1/2015 | 945 | 7679 |
| 4 | B02617 | 1/1/2015 | 1228 | 9537 |
!pip install chart_studio
!pip install plotly
Defaulting to user installation because normal site-packages is not writeable Requirement already satisfied: chart_studio in c:\users\hp\appdata\roaming\python\python311\site-packages (1.1.0) Requirement already satisfied: plotly in d:\new folder\lib\site-packages (from chart_studio) (5.9.0) Requirement already satisfied: requests in d:\new folder\lib\site-packages (from chart_studio) (2.31.0) Requirement already satisfied: retrying>=1.3.3 in c:\users\hp\appdata\roaming\python\python311\site-packages (from chart_studio) (1.3.4) Requirement already satisfied: six in d:\new folder\lib\site-packages (from chart_studio) (1.16.0) Requirement already satisfied: tenacity>=6.2.0 in d:\new folder\lib\site-packages (from plotly->chart_studio) (8.2.2) Requirement already satisfied: charset-normalizer<4,>=2 in d:\new folder\lib\site-packages (from requests->chart_studio) (2.0.4) Requirement already satisfied: idna<4,>=2.5 in d:\new folder\lib\site-packages (from requests->chart_studio) (3.4) Requirement already satisfied: urllib3<3,>=1.21.1 in d:\new folder\lib\site-packages (from requests->chart_studio) (1.26.16) Requirement already satisfied: certifi>=2017.4.17 in d:\new folder\lib\site-packages (from requests->chart_studio) (2023.7.22) Defaulting to user installation because normal site-packages is not writeable Requirement already satisfied: plotly in d:\new folder\lib\site-packages (5.9.0) Requirement already satisfied: tenacity>=6.2.0 in d:\new folder\lib\site-packages (from plotly) (8.2.2)
# Plotly setup: offline mode renders interactive charts inside the notebook
# without a chart-studio account.
import chart_studio.plotly as py
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import download_plotlyjs, init_notebook_mode,plot,iplot
init_notebook_mode(connected=True)
uber_foil.columns  # check exact column names before plotting
Index(['dispatching_base_number', 'date', 'active_vehicles', 'trips'], dtype='object')
# Box plot: 5-point summary (min, Q1, median, Q3, max) of daily active
# vehicles for each dispatching base.
px.box(uber_foil, x='dispatching_base_number', y='active_vehicles')
# Violin plot: same comparison but showing the full distribution shape.
px.violin(uber_foil, x='dispatching_base_number', y='active_vehicles')
# Build the list of 2014 monthly Uber trip files to merge.
# Selecting by filename pattern is more robust than slicing the listdir()
# output at a hard-coded position ([-8:]) and then remove()-ing the two
# Jan-June 2015 files by hand — it keeps working if files are added,
# renamed, or listed in a different order.
path = r"C:\Users\hp\Desktop\DataAnalysis\Datasets-20240722T151923Z-001\Datasets"
files = [f for f in os.listdir(path)
         if f.startswith('uber-raw-data') and f.endswith('14.csv')]
files
# Combine all the 2014 monthly CSVs into one DataFrame.
# Collect the frames in a list and concatenate once at the end:
# calling pd.concat inside the loop copies the accumulated data on
# every iteration (quadratic cost over millions of rows).
path=r"C:\Users\hp\Desktop\DataAnalysis\Datasets-20240722T151923Z-001\Datasets"
frames = [pd.read_csv(os.path.join(path, file)) for file in files]
# ignore_index gives the merged frame a clean 0..n-1 index instead of
# repeating each source file's row numbers.
final = pd.concat(frames, ignore_index=True)
final.shape
(4534327, 4)
final.duplicated().sum()  # count fully-duplicated rows across the merged 2014 data
82581
final.drop_duplicates(inplace=True)  # remove the duplicated rows in place
final.shape  # shape after cleaning
(4451746, 4)
final.head()  # merged 2014 data: timestamp, pickup latitude/longitude, base code
| Date/Time | Lat | Lon | Base | |
|---|---|---|---|---|
| 0 | 9/1/2014 0:01:00 | 40.2201 | -74.0021 | B02512 |
| 1 | 9/1/2014 0:01:00 | 40.7500 | -74.0027 | B02512 |
| 2 | 9/1/2014 0:03:00 | 40.7559 | -73.9864 | B02512 |
| 3 | 9/1/2014 0:06:00 | 40.7450 | -73.9889 | B02512 |
| 4 | 9/1/2014 0:11:00 | 40.8145 | -73.9444 | B02512 |
# Count pickups at each exact (Lat, Lon) coordinate pair; the resulting
# 'size' column is used as the intensity weight for the heat map below.
rush_uber = final.groupby(['Lat', 'Lon']).size().reset_index(name='size')
rush_uber.head()
| Lat | Lon | size | |
|---|---|---|---|
| 0 | 39.6569 | -74.2258 | 1 |
| 1 | 39.6686 | -74.1607 | 1 |
| 2 | 39.7214 | -74.2446 | 1 |
| 3 | 39.8416 | -74.1512 | 1 |
| 4 | 39.9055 | -74.0791 | 1 |
!pip install folium
Defaulting to user installation because normal site-packages is not writeable Requirement already satisfied: folium in c:\users\hp\appdata\roaming\python\python311\site-packages (0.17.0) Requirement already satisfied: branca>=0.6.0 in c:\users\hp\appdata\roaming\python\python311\site-packages (from folium) (0.7.2) Requirement already satisfied: jinja2>=2.9 in d:\new folder\lib\site-packages (from folium) (3.1.2) Requirement already satisfied: numpy in d:\new folder\lib\site-packages (from folium) (1.24.3) Requirement already satisfied: requests in d:\new folder\lib\site-packages (from folium) (2.31.0) Requirement already satisfied: xyzservices in d:\new folder\lib\site-packages (from folium) (2022.9.0) Requirement already satisfied: MarkupSafe>=2.0 in d:\new folder\lib\site-packages (from jinja2>=2.9->folium) (2.1.1) Requirement already satisfied: charset-normalizer<4,>=2 in d:\new folder\lib\site-packages (from requests->folium) (2.0.4) Requirement already satisfied: idna<4,>=2.5 in d:\new folder\lib\site-packages (from requests->folium) (3.4) Requirement already satisfied: urllib3<3,>=1.21.1 in d:\new folder\lib\site-packages (from requests->folium) (1.26.16) Requirement already satisfied: certifi>=2017.4.17 in d:\new folder\lib\site-packages (from requests->folium) (2023.7.22)
import folium
# Base world map to draw on.
basemap = folium.Map()
from folium.plugins import HeatMap
# HeatMap takes rows of [lat, lon, weight] — exactly the column order of
# rush_uber (Lat, Lon, size) — and overlays pickup density on the map.
HeatMap(rush_uber).add_to(basemap)
<folium.plugins.heat_map.HeatMap at 0x1aaccfc6cd0>
basemap  # render the heat map of pickup density